Environment Setting

suppressMessages({
  library(data.table)
  library(dplyr)
  library(plotly)
  library(abn)
})

Data Acquisition

var_path <- "data/20230311_tips.csv"

data <- var_path %>% fread()

Pre-proccess

data <- data %>% mutate(
  sex = sex %>% factor(., c("Female", "Male")),
  smoker = smoker %>% factor(., c("No", "Yes")),
  day = day %>% factor(., c("Thur", "Fri", "Sat", "Sun")),
  time = time %>% factor(., c("Lunch", "Dinner"))
)

Check Data

data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 244
Number of columns 7
Key NULL
_______________________
Column type frequency:
factor 4
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 Mal: 157, Fem: 87
smoker 0 1 FALSE 2 No: 151, Yes: 93
day 0 1 FALSE 4 Sat: 87, Sun: 76, Thu: 62, Fri: 19
time 0 1 FALSE 2 Din: 176, Lun: 68

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
total_bill 0 1 19.79 8.90 3.07 13.35 17.8 24.13 50.81 ▃▇▃▁▁
tip 0 1 3.00 1.38 1.00 2.00 2.9 3.56 10.00 ▇▆▂▁▁
size 0 1 2.57 0.95 1.00 2.00 2.0 3.00 6.00 ▇▂▂▁▁

Visualization

3D Scatter

tip ~ size + total_bill

data %>%
  plot_ly(x = ~size, y = ~total_bill, z = ~tip, type = "scatter3d", mode = "markers")

tip ~ size + total_bill + smoker

data %>%
  plot_ly(x = ~size, y = ~total_bill, z = ~tip, color = ~smoker, type = "scatter3d", mode = "markers")

Scatter with lm

tip ~ size + total_bill + day +smoker + sex

ggplotly(
  data %>%
    mutate(
      day = day %>% paste("day =", .),
      smoker = smoker %>% paste("smoker =", .)
    ) %>%
    ggplot(aes(x = total_bill, y = tip, color = sex)) +
    geom_smooth(method = "lm", se = F, size = .5, linetype = 2, formula = "y ~ x") +
    geom_point(alpha = .8) +
    facet_wrap(smoker ~ day, ncol = 4, scales = "free") +
    scale_x_continuous(labels = scales::label_comma()) +
    scale_y_continuous(labels = scales::label_comma()) +
    colorspace::scale_color_discrete_diverging("Green-Orange") +
    colorspace::scale_fill_discrete_diverging("Green-Orange") +
    ggthemes::theme_fivethirtyeight() +
    theme(legend.position = "bottom")
)

Box plot

Scatter with lm: tip ~ total_bill + day + sex

ggplotly(data %>%
  mutate(
    day = day %>% paste("day =", .),
    smoker = smoker %>% paste("smoker =", .)
  ) %>%
  ggplot(aes(x = sex, y = tip, color = sex)) +
  geom_boxplot() +
  facet_wrap(~day, ncol = 4, scales = "free") +
  scale_y_continuous(labels = scales::label_comma()) +
  colorspace::scale_color_discrete_diverging("Green-Orange") +
  colorspace::scale_fill_discrete_diverging("Green-Orange") +
  ggthemes::theme_fivethirtyeight() +
  theme(legend.position = "bottom"))

Hypothesis testing

Structure Discover

data_abn <- data %>%
  mutate(day = forcats::fct_recode(day, "Weekday" = "Thur", "Weekday" = "Fri", "Weekend" = "Sat", "Weekend" = "Sun")) %>%
  mutate_at(.vars = c("total_bill", "tip"), ~ log(.)) %>%
  data.frame()

data.dists <- list(
  "total_bill" = "gaussian",
  "tip" = "gaussian",
  "sex" = "binomial",
  "smoker" = "binomial",
  "day" = "binomial",
  "time" = "binomial",
  "size" = "gaussian"
)


dag.banned <- matrix(0, ncol(data_abn), ncol(data_abn), dimnames = list(names(data.dists), names(data.dists)))

dag.banned["size", "total_bill"] <- 1
dag.banned["total_bill", "tip"] <- 1
dag.banned["size", "tip"] <- 1

data_abn %>%
  buildScoreCache(data.df = ., data.dists = data.dists, max.parents = 1, dag.banned = dag.banned) %>%
  mostProbable() %>%
  fitAbn() %>%
  plot()
## Step1. completed max alpha_i(S) for all i and S
## Total sets g(S) to be evaluated over: 128

Regression

list_model <- c("total_bill", "size", "total_bill + size") %>%
  paste("tip ~", .) %>%
  c(., "total_bill ~ size") %>%
  lapply(., as.formula) %>%
  lapply(., function(x) {
    glm(data = data_abn, formula = x)
  })

suppressMessages({
  list_model %>%
    jtools::export_summs(.,
      error_format = "[{conf.low}, {conf.high}]",
      model.names = c("total bill", "size", "both", "total bill by size size")
    )
})
total billsizebothtotal bill by size size
(Intercept)-0.95 ***0.44 ***-0.89 ***2.19 ***
[-1.22, -0.68]   [0.30, 0.57]   [-1.16, -0.61]   [2.06, 2.32]   
total_bill0.68 ***       0.60 ***       
[0.58, 0.77]          [0.49, 0.72]          
size       0.22 ***0.06 *  0.27 ***
       [0.17, 0.27]   [0.00, 0.11]   [0.23, 0.32]   
N244       244       244       244       
AIC141.35    228.26    138.81    191.31    
BIC151.85    238.75    152.80    201.80    
Pseudo R20.67    0.34    0.68    0.50    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Plot model fitness

total_bill

par(mfrow = c(2, 2))
list_model[[1]] %>% plot()

both

par(mfrow = c(2, 2))
list_model[[3]] %>% plot()

model_mediate = mediation::mediate(model.m = list_model[[4]], model.y = list_model[[3]],
                                   treat = "size", mediator = "total_bill")
model_mediate%>%summary
## 
## Causal Mediation Analysis 
## 
## Quasi-Bayesian Confidence Intervals
## 
##                Estimate 95% CI Lower 95% CI Upper p-value    
## ACME            0.16403      0.12468         0.21  <2e-16 ***
## ADE             0.05661      0.00808         0.11   0.026 *  
## Total Effect    0.22064      0.16836         0.27  <2e-16 ***
## Prop. Mediated  0.74180      0.56467         0.96  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 244 
## 
## 
## Simulations: 1000
model_mediate%>%plot